library(readr)
library(data.table)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(shiny)
setwd("C:/Users/anubh/OneDrive/Documents/GitHub/LiU/LiU-2018/Visualization/Assignment_1")
data_senic <- read_table2("Input/SENIC.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## X2 = col_double(),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_double(),
## X6 = col_double(),
## X7 = col_integer(),
## X8 = col_integer(),
## X9 = col_integer(),
## X10 = col_integer(),
## X11 = col_integer(),
## X12 = col_double()
## )
setnames(data_senic, old = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12"),
new = c("ID", "length_of_stay", "avg_age", "avg_risk", "culturing_ratio", "chest_xray_ratio",
"avg_beds", "school_affil", "region", "avg_daily_census", "avg_nurses", "facility_per"))
# Fixing the datatypes
data_senic$school_affil <- factor(data_senic$school_affil, levels = c(1, 2),labels = c("Yes", "No"))
data_senic$region <- factor(data_senic$region, levels = c(1, 2, 3, 4),labels = c("NE", "NC", "S", "W"))
quant_cal <- function(x){
#x <- mtcars$hp
q1 <- as.numeric(quantile(x, probs = c(0.25)))
q3 <- as.numeric(quantile(x, probs = c(0.75)))
iqr <- q3-q1
upper_value <- q3+(1.5*iqr)
lower_value <- q1-(1.5*iqr)
outliers_index <- which(x < lower_value | x > upper_value)
#which(x %in% boxplot(x)$out) # must be equal to above
return(outliers_index)
}
ggplot(data=data_senic, aes(x=avg_risk)) + geom_density() + geom_point(data = data_senic[quant_cal(data_senic$avg_risk), c("avg_risk")], aes(y = 0, x = avg_risk), shape=18, size=2)
Analysis: We see that the number of outliers are fairly small, however this visualization is not very insightful
data_senic_numeric <- data_senic[,c("ID", "length_of_stay", "avg_age", "avg_risk", "culturing_ratio", "chest_xray_ratio",
"avg_beds", "avg_daily_census", "avg_nurses", "facility_per")]
p1 <- ggplot(data=data_senic_numeric, aes(x=length_of_stay)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$length_of_stay), c("length_of_stay")], aes(y = 0, x = length_of_stay), shape=18, size=2)
p2 <- ggplot(data=data_senic_numeric, aes(x=avg_age)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_age), c("avg_age")], aes(y = 0, x = avg_age), shape=18, size=2)
p3 <- ggplot(data=data_senic_numeric, aes(x=culturing_ratio)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$culturing_ratio), c("culturing_ratio")], aes(y = 0, x = culturing_ratio), shape=18, size=2)
p4 <- ggplot(data=data_senic_numeric, aes(x=chest_xray_ratio)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$chest_xray_ratio), c("chest_xray_ratio")], aes(y = 0, x = chest_xray_ratio), shape=18, size=2)
p5 <- ggplot(data=data_senic_numeric, aes(x=avg_beds)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_beds), c("avg_beds")], aes(y = 0, x = avg_beds), shape=18, size=2)
p6 <- ggplot(data=data_senic_numeric, aes(x=avg_daily_census)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_daily_census), c("avg_daily_census")], aes(y = 0, x = avg_daily_census), shape=18, size=2)
p7 <- ggplot(data=data_senic_numeric, aes(x=avg_nurses)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_nurses), c("avg_nurses")], aes(y = 0, x = avg_nurses), shape=18, size=2)
p8 <- ggplot(data=data_senic_numeric, aes(x=facility_per)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$facility_per), c("facility_per")], aes(y = 0, x = facility_per), shape=18, size=2)
p9 <- ggplot(data=data_senic_numeric, aes(x=avg_risk)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_risk), c("avg_risk")], aes(y = 0, x = avg_risk), shape=18, size=2)
# final output
grid.arrange(p1, p2, p3, p4, p5, p6, p7, p9) # facility_per does not contain any outlier
Analysis: We see that the number of outliers are fairly small, however this visualization is not very insightful
ggplot(data = data_senic, aes(x = avg_nurses, y = avg_risk)) + geom_point(aes(color = avg_beds))
ggplotly(p9)
data = data_senic_numeric[quant_cal(data_senic_numeric$avg_risk), c("avg_risk")]
data_senic %>% select(avg_risk) %>% plot_ly(x = ~avg_risk, type = "histogram") %>%
add_markers(x = data$avg_risk, y = 0, showlegend = FALSE, marker = list(color = "red", symbol = c('diamond')))